Spline functions

Splines are functions defined as piecewise polynomials of fixed degree. The points of connection of the polynomials are called knots. Different basis expansions can be used to define a spline function.

We use a naive application on Boston dataset available in the MASS package to compare the use of polynomials and splines. The dataset collects information on housing values in suburbs of Boston and in particular we are going to model the continuous variable medv (median value of owner-occupied homes in $1000s) using a simple linear model including the continuous variable lstat (lower status of the population) as a predictor.

# Scatterplot of median home value (medv) against lower-status share (lstat):
# the relationship is visibly non-linear, motivating polynomials and splines.
library(ggplot2)
data("Boston", package = "MASS")
ggplot(Boston, aes(lstat, medv) ) +
  geom_point() 

# Baseline simple linear model, followed by the four standard lm diagnostic
# plots (residuals vs fitted, QQ, scale-location, leverage) on a 2x2 grid.
fit <- lm(medv ~ lstat, data = Boston)
par(mfrow=c(2,2))
plot(fit)

# Two equivalent ways to fit a degree-2 polynomial in lstat:
# I(lstat^2) adds the squared term by hand, while poly(..., raw = TRUE)
# builds the raw (non-orthogonal) polynomial basis. The two summaries
# printed below show identical coefficients and fit statistics.
fit.poly <- lm(medv ~ lstat + I(lstat^2), data = Boston)
fit.poly2 <- lm(medv ~ poly(lstat, degree = 2, raw = TRUE), data = Boston)
summary(fit.poly2); summary(fit.poly)
## 
## Call:
## lm(formula = medv ~ poly(lstat, degree = 2, raw = TRUE), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          42.862007   0.872084   49.15   <2e-16 ***
## poly(lstat, degree = 2, raw = TRUE)1 -2.332821   0.123803  -18.84   <2e-16 ***
## poly(lstat, degree = 2, raw = TRUE)2  0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
# Degree-5 polynomial fit. Note poly() without raw = TRUE uses orthogonal
# polynomials; the smoother in the plot uses the raw basis, which gives the
# same fitted curve with different coefficients.
fit.poly5 <- lm(medv ~ poly(lstat, 5), data = Boston)

# Scatterplot with the degree-5 polynomial fit overlaid.
ggplot(Boston, aes(lstat, medv) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 5, raw = TRUE))

# Cubic B-spline with interior knots at the three quartiles of lstat.
# quantile() returns min/25%/50%/75%/max, so elements 2:4 are the quartiles.
library(splines)
knots <- quantile(Boston$lstat)[2:4]
fit.spline <- lm (medv ~ bs(lstat, knots = knots), data = Boston)
summary(fit.spline)
## 
## Call:
## lm(formula = medv ~ bs(lstat, knots = knots), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.8071  -3.1502  -0.7389   2.1076  26.9529 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 50.628      2.582  19.608  < 2e-16 ***
## bs(lstat, knots = knots)1  -13.682      3.886  -3.521  0.00047 ***
## bs(lstat, knots = knots)2  -26.684      2.449 -10.894  < 2e-16 ***
## bs(lstat, knots = knots)3  -28.416      2.917  -9.743  < 2e-16 ***
## bs(lstat, knots = knots)4  -40.092      3.050 -13.144  < 2e-16 ***
## bs(lstat, knots = knots)5  -39.718      4.212  -9.431  < 2e-16 ***
## bs(lstat, knots = knots)6  -38.484      4.134  -9.308  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.206 on 499 degrees of freedom
## Multiple R-squared:  0.6833, Adjusted R-squared:  0.6795 
## F-statistic: 179.5 on 6 and 499 DF,  p-value: < 2.2e-16
# Equivalent specification via degrees of freedom: for a cubic spline,
# df = 6 makes bs() place 3 interior knots at quantiles of lstat
# (interior knots = df - degree), reproducing the previous fit exactly.
fit.spline <- lm (medv ~ bs(lstat, df = 6), data = Boston)
summary(fit.spline)
## 
## Call:
## lm(formula = medv ~ bs(lstat, df = 6), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.8071  -3.1502  -0.7389   2.1076  26.9529 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          50.628      2.582  19.608  < 2e-16 ***
## bs(lstat, df = 6)1  -13.682      3.886  -3.521  0.00047 ***
## bs(lstat, df = 6)2  -26.684      2.449 -10.894  < 2e-16 ***
## bs(lstat, df = 6)3  -28.416      2.917  -9.743  < 2e-16 ***
## bs(lstat, df = 6)4  -40.092      3.050 -13.144  < 2e-16 ***
## bs(lstat, df = 6)5  -39.718      4.212  -9.431  < 2e-16 ***
## bs(lstat, df = 6)6  -38.484      4.134  -9.308  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.206 on 499 degrees of freedom
## Multiple R-squared:  0.6833, Adjusted R-squared:  0.6795 
## F-statistic: 179.5 on 6 and 499 DF,  p-value: < 2.2e-16
# Compare three smoothers on the same scatterplot: a cubic polynomial
# (blue), a 6-df cubic spline (red), and a deliberately over-flexible
# 100-df spline (green) that clearly overfits the data.
ggplot(Boston, aes(lstat, medv) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ poly(x, 3, raw = TRUE))+
  stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 6), col="red")+
  stat_smooth(method = lm, formula = y ~ splines::bs(x, df = 100), col="green")

The next plot shows a comparison of 3 spline functions with 1 knot on the median of the predictor and different spline degrees.

# Splines with a single interior knot at the median of lstat, varying the
# polynomial degree: cubic (bs() default, blue), quadratic (red),
# linear/piecewise-linear (green).
ggplot(Boston, aes(lstat, medv) ) +
  geom_point() +
  stat_smooth(method = lm, formula = y ~ splines::bs(x, knots = median(Boston$lstat)))+
  stat_smooth(method = lm, formula = y ~ splines::bs(x, knots = median(Boston$lstat),
                                                     degree=2),col="red")+
    stat_smooth(method = lm, formula = y ~ splines::bs(x, knots = median(Boston$lstat),
                                                     degree=1),col="green")

Generalized additive models (GAM)

A generalized additive model (GAM) is a generalized linear model (GLM) in which the linear predictor is given by a user specified sum of smooth functions of the covariates plus a conventional parametric component of the linear predictor. Given \(\mu_i=E[Y_i]\), a simple example is:

\[\log(\mu_i)= \beta_0+\sum_{j=1}^{p}s_j(x_{ij}), \ i=1,\ldots,n,\] where the dependent variable \(Y_{i} \sim \mathsf{Gamma}(\mu_i, \phi)\) and \(s_j\) are smooth functions of covariates \(x_j\). (Since \(\mu_i=E[Y_i]\), the model for the mean carries no additive error term.) GAM models imply the following likelihood penalization

\[ l(\boldsymbol{\beta}, \boldsymbol{s})-\lambda R(\boldsymbol{s}),\]

where \(R(\boldsymbol{s})\) is a measure of roughness.

Trees dataset

As an example, consider the simple dataset trees, where we have measurements of the girth, height and volume of timber in 31 felled black cherry trees. Note that girth is the diameter of the tree (in inches) measured at 4 ft 6 in above the ground. First of all, we use the pairs plot for checking possible correlations between the three variables.


The model

In order to give a general idea about the GAM flexibility, we may start with one predictor. We could fit a simple Generalized Linear Model (GLM), and a GAM model \(Y_{i} \sim \mathsf{Gamma}(\mu_i, \phi)\), such that:

\[\begin{align*} \log(\mu_i)= & \beta_0+\beta_1 \mathsf{Height}_i\\ \log(\mu_i)= & \beta_0+ s_1(\mathsf{Height}_i), \end{align*}\]

for \(i=1,\ldots,n\). The question we pose is whether the GLM structure is good enough for fitting these data. We could start by printing the output of the GAM model; then, we may plot the predicted values for both the models.

# Gamma GLM with log link versus a GAM replacing the linear Height effect
# with a smooth term s(Height). gam() and s() come from mgcv, which is not
# attached anywhere earlier in the document, so load it explicitly here.
library(mgcv)

glm.1 <- glm(Volume ~ Height, family = Gamma(link=log), data=trees)
gam.1 <- gam(Volume ~ s(Height), family=Gamma(link=log), data=trees)

summary(gam.1)
## 
## Family: Gamma 
## Link function: log 
## 
## Formula:
## Volume ~ s(Height)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.34970    0.07236   46.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F  p-value    
## s(Height)   1      1 22.97 4.56e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.348   Deviance explained = 42.6%
## GCV = 0.17591  Scale est. = 0.1623    n = 31
# Overlay the fitted values of the GLM (blue) and GAM (red) on the raw
# data; the two sets of points coincide, since the GAM smooth reduces to a
# straight line here (edf = 1 in the summary above).
plot(trees$Height, trees$Volume, xlab="Height", ylab="Fitted values")
points(trees$Height, glm.1$fitted.values, col="blue", bg=3, pch=19)
points(trees$Height, gam.1$fitted.values, col="red", bg=2, pch=19, cex=.5)
legend("topright", c("GLM","GAM"), pch=19, col=c("blue", "red"))

Diagnostic plots involve the representation of the smooth function and the partial residuals defined as: \[\hat\epsilon^{part}_{ij}=\hat s_j(x_{ij})+\hat\epsilon_i^{P}\] where \(\hat\epsilon^{P}\) are the Pearson residuals of the model. Looking at this plot we are interested in noticing non-linearity or wiggly behavior of the smooth function, and whether the partial residuals are evenly distributed around the function.

# Smooth term with partial residuals overlaid; an effectively straight line
# (edf = 1) suggests the linear GLM is adequate for Height.
plot(gam.1,residuals=TRUE,pch=19)

By the joint analysis of the summary output—look at the value edf=1—the prediction plots and the final plot displaying \(s(\mathsf{Height})\) versus \(\mathsf{Height}\), we may conclude that this simple single predictor framework may be well explained by a GLM approach.

Adding another predictor

Consider now to add the predictor \(\mathsf{Girth}\), and take the same model as before, \(Y_{i}=\mathsf{Volume}_i\sim \mathsf{Gamma}(\mu_i, \phi)\), then:

\[\log(\mu_i)=s_1(\mathsf{Height}_i)+s_2(\mathsf{Girth}_i).\] Analogously as before, we fit the GLM and the GAM model. We plot together the fitted values and we separately analyze the smooth terms \(s_1(\mathsf{Height})\) and \(s_2(\mathsf{Girth})\).

# Two-predictor versions of the same models: linear effects in the GLM,
# separate smooth terms for Height and Girth in the GAM.
glm.2<-glm(Volume ~ Girth + Height, family = Gamma(link=log), data=trees)
gam.2 <- gam(Volume ~ s(Height) + s(Girth), family=Gamma(link=log), data=trees)
summary(gam.2)
## 
## Family: Gamma 
## Link function: log 
## 
## Formula:
## Volume ~ s(Height) + s(Girth)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.27570    0.01492   219.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df      F  p-value    
## s(Height) 1.000  1.000  31.32 7.07e-06 ***
## s(Girth)  2.422  3.044 219.28  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.973   Deviance explained = 97.8%
## GCV = 0.0080824  Scale est. = 0.006899  n = 31


Options to visualize the fitted surface are collected in the function vis.gam which offers perspective and contour plot for a gam model.

# Fitted surface on the response scale (volume) and on the link scale
# (log-volume), viewed in perspective...
par(mfrow=c(1,2))
vis.gam(gam.2,theta=-45,type = "response", color="terrain")
vis.gam(gam.2,theta=-45,type = "link", color="terrain")

# ...and as contour plots of the same two surfaces.
par(mfrow=c(1,2))
vis.gam(gam.2,type = "response", color="terrain", plot.type = "contour")
vis.gam(gam.2,type = "link", color="terrain", plot.type = "contour")

The output reports an edf equals 2.422 for the \(\mathsf{Girth}\) predictor, meaning that a straight line could be partially inappropriate for our purposes. As a further check, we may compute the AIC for the four proposed models:

# Compare all four models; lower AIC indicates a better trade-off of fit
# against (effective) number of parameters.
AIC(glm.1, gam.1, glm.2, gam.2)
##             df      AIC
## glm.1 3.000000 239.6724
## gam.1 3.000009 239.6724
## glm.2 4.000000 151.0081
## gam.2 5.422254 142.8559

The AIC for the single predictor models coincide. The AIC dramatically decreases when considering the second predictor, with the GAM model favored over the GLM model.

An issue we should consider is the so called optimal degree of smoothness. With the argument sp in the gam function we can manually control the degree of smoothness for each smooth function included in the model.

#unpenalized regression spline
# fx = TRUE fixes each smooth at its full basis dimension (k - 1 = 9 edf
# per term), i.e. no roughness penalty is applied and no smoothing
# parameters are estimated.
gam.2.1 <- gam(Volume ~ s(Height, k=10, fx=TRUE)+
                 s(Girth, k=10, fx=TRUE),
               family=Gamma(link=log), data=trees)
# Empty: with fx = TRUE there are no smoothing parameters to report.
gam.2.1$sp
## named numeric(0)
summary(gam.2.1)
## 
## Family: Gamma 
## Link function: log 
## 
## Formula:
## Volume ~ s(Height, k = 10, fx = TRUE) + s(Girth, k = 10, fx = TRUE)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.27370    0.01261   259.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df      F  p-value    
## s(Height)   9      9  5.885  0.00295 ** 
## s(Girth)    9      9 85.247 1.72e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.983   Deviance explained = 99.3%
## GCV = 0.012894  Scale est. = 0.0049312  n = 31
# Smooth terms with partial residuals for the unpenalized fit: with 9 edf
# each, the estimated functions are visibly wigglier than before.
par(mfrow=c(1,2))
plot(gam.2.1,residuals=TRUE,pch=19)

# Corresponding (rougher) fitted surface on the link scale.
par(mfrow=c(1,1))
vis.gam(gam.2.1,theta=-45,type = "link", color="terrain")

However, usually \(\lambda\) may be estimated by several methods, such as CV, AIC, BIC… By default, the gam function estimates this quantity via GCV (generalized cross-validation, reported in the summary output above). Let’s take a look explicitly at the AIC and how \(\lambda\) is selected.

# Profile the smoothing parameters: take the sp values selected for gam.2,
# rescale them over ten orders of magnitude, refit at each scale, and track
# fit (-2 log-likelihood) against model complexity (effective df) and AIC.

#extract the values
sp <- gam.2$sp

tuning.scale <- c(1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5)
scale.exponent <- log10(tuning.scale)
n.tuning <- length(tuning.scale)

# Preallocate the profile containers.
minus2loglik <- rep(NA, n.tuning)
edf <- rep(NA, n.tuning)
aic <- rep(NA, n.tuning)

for (i in seq_len(n.tuning)) {
  # Refit with both smoothing parameters multiplied by the same factor.
  gamobj <- gam(Volume ~ s(Height) + s(Girth), family = Gamma(link = log),
                data = trees, sp = tuning.scale[i] * sp)
  minus2loglik[i] <- -2 * logLik(gamobj)
  # The +1 presumably accounts for the scale parameter on top of the
  # coefficient edfs -- TODO confirm against the built-in AIC() values.
  edf[i] <- sum(gamobj$edf) + 1
  aic[i] <- AIC(gamobj)
}

# Four panels: fit, complexity, built-in AIC, and hand-computed AIC
# (-2 log-lik + 2 * edf) as functions of log10 of the scaling factor.
par(mfrow = c(2, 2))
plot(scale.exponent, minus2loglik, type = "b", main = "-2 log likelihood")
plot(scale.exponent, edf, ylim = c(0, 70), type = "b",
     main = "effective number of parameters")
plot(scale.exponent, aic, type = "b", main = "AIC build-in function")
plot(scale.exponent, minus2loglik + 2 * edf, type = "b", main = "AIC")

# find the minimum
# AIC is minimized at scale 1, i.e. at exactly the sp values gam() chose.
opt.tuning.scale <- tuning.scale[which.min(aic)]
opt.tuning.scale
## [1] 1
# Optimal smoothing parameters for the final refit.
opt.sp<-opt.tuning.scale*sp

# fitting the final model with the optimal level of smoothing

gam.2.opt <- gam(Volume ~ s(Height)+s(Girth), family=Gamma(link=log),data=trees,
                 sp=opt.sp)

# Identical AIC values: the optimal scaling is 1, so gam.2.opt == gam.2.
AIC(gam.2, gam.2.opt)
##                 df      AIC
## gam.2     5.422254 142.8559
## gam.2.opt 5.422254 142.8559

It’s your turn!

1- Simulate some Poisson data with the gamSim function contained in the mgcv package (Use the default arguments eg=1 for the Gu and Wahba 4 univariate term example and scale=0.1).

2- Fit a Gam and print the results. Interpret the fit.

3- Produce some plots for each \(x_j\) plotted against \(s(x_j)\), and comment. Do you maybe drop some covariates?

4- Compute the AIC between your final gam and a glm.

Tree-based classification

We consider the example of detection of email spam. Data are available from the UCI repository of machine learning databases (http://www.ics.uci.edu/~mlearn/MLRepository.html) and they collect information about 4601 email items, 1813 classified as spam. 57 explanatory variables describe several characteristics of the data. Following the example in Data Analysis and Graphics Using R, we will consider only 6 of them, mostly related to the frequency of specific words and characters in the email. In detail,

Using these 6 explanatory variables we want to build a decision tree model that is able to classify each email correctly as spam (y in the binary outcome variable yesno) or non-spam (n in the binary outcome variable yesno).

library(reshape2)
library(rpart)
library(rpart.plot)
# Read the raw spambase data (comma-separated, no header) and keep the
# outcome column plus six word/character-frequency predictors, selected by
# their column positions in the original 58-column file.
spam <- read.table("spambase.data", sep=",")
spam <- spam[,c(58,#"yesno"
                57,#"crl.tot"
                53,#"dollar" 
                52,#"bang"
                24,#"money" 
                23,#"n000"
                1  #"make"
                )]
colnames(spam) <- c("yesno","crl.tot","dollar","bang", "money","n000", "make")
# Recode the 0/1 outcome as a factor with labels n (non-spam) / y (spam).
spam$yesno <- factor(spam$yesno, levels = c(0, 1))
levels(spam$yesno) <- c("n","y")

# Side-by-side boxplots of each predictor split by spam status.
par(mfrow=c(2,3))
for(i in 2:7){
  boxplot(spam[,i]~yesno, data=spam, ylab = colnames(spam)[i])
}

# Logistic-regression alternative kept for reference (not run).
#summary(glm(yesno ~ crl.tot + dollar + bang + money + n000 + log(make+.5),
#            family=binomial, data=spam))

The model can be fitted using the function rpart in the library rpart. The option method = "class" (a classification tree) is selected by default when the outcome variable is of type factor.

library(rpart)

# Classification tree for spam status on the six frequency predictors.
spam.rpart <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                    method="class", data=spam) 
plot(spam.rpart) # Draw tree
text(spam.rpart) # Add labeling

The summary of the fitted decision tree is visualized using the function printcp.

# Complexity-parameter table: one row per candidate subtree, with relative
# and cross-validated (xerror) error scaled by the root node error.
printcp(spam.rpart)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class")
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar 
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.476558      0   1.00000 1.00000 0.018282
## 2 0.075565      1   0.52344 0.55378 0.015453
## 3 0.011583      3   0.37231 0.38334 0.013398
## 4 0.010480      4   0.36073 0.38720 0.013453
## 5 0.010000      5   0.35025 0.38114 0.013366

The complexity parameter CP is a proxy for the number of splits. In order to avoid overly complex trees, the reduction of lack-of-fit for each additional split is evaluated with an increasing cost. When the cost outweighs the reduction, the algorithm stops.

A small complexity parameter CP leads to a large tree, and a large CP leads to a small tree. Let’s try to decrease the default CP value for our model.

# Deliberately overgrown tree: cp = 0 disables the complexity penalty so
# the tree keeps splitting; its CP table is used below for pruning.
spam.rpart0001 <- rpart(yesno ~ crl.tot + dollar + bang + money + n000 + make,
                    method="class", data=spam, cp = 0) 
printcp(spam.rpart0001)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class", cp = 0)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  make    money   n000   
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##            CP nsplit rel error  xerror     xstd
## 1  4.7656e-01      0   1.00000 1.00000 0.018282
## 2  7.5565e-02      1   0.52344 0.54385 0.015352
## 3  1.1583e-02      3   0.37231 0.39272 0.013531
## 4  1.0480e-02      4   0.36073 0.38720 0.013453
## 5  6.3431e-03      5   0.35025 0.37176 0.013229
## 6  5.5157e-03     10   0.31660 0.35962 0.013048
## 7  4.4126e-03     11   0.31109 0.34639 0.012844
## 8  3.8610e-03     12   0.30667 0.34142 0.012767
## 9  2.7579e-03     16   0.29123 0.32708 0.012536
## 10 2.2063e-03     17   0.28847 0.32267 0.012464
## 11 1.9305e-03     18   0.28627 0.32377 0.012482
## 12 1.6547e-03     20   0.28240 0.32432 0.012491
## 13 9.1929e-04     25   0.27413 0.32708 0.012536
## 14 8.2736e-04     29   0.26917 0.32819 0.012554
## 15 5.5157e-04     46   0.25427 0.33094 0.012599
## 16 3.3094e-04     48   0.25317 0.33646 0.012688
## 17 2.7579e-04     53   0.25152 0.33756 0.012705
## 18 1.8386e-04     61   0.24931 0.33756 0.012705
## 19 6.8946e-05     64   0.24876 0.34197 0.012775
## 20 0.0000e+00     72   0.24821 0.34253 0.012784

The relative error shown in the summary decreases at any additional split, so it is not useful to evaluate the predictive accuracy of the model, while the xerror (which stands for cross-validated relative error) reaches a minimum. Since the xerror is computed using a 10-fold cross-validation procedure by default, the values change slightly every time we fit a new model. The relative error remains exactly the same.

# Cross-validated error against tree size; the dashed line marks the
# one-standard-deviation threshold used for pruning below.
plotcp(spam.rpart0001)

Now that we have a very large (overfitted) tree, what is the best number of splits where to prune our tree? Changes in the xerrors are so small that running the model again would probably lead to a different choice of number of splits if it is based on selecting the tree with the absolute minimum xerror. To reduce instability in the choice we can use the one-standard-deviation rule. The horizontal dashed line in the plot shows the minimum xerror + standard deviation. So choosing the smallest tree whose xerror is less than or equal to this value will lead us to a more conservative choice if the interest is in choosing the optimal predictive tree, i.e., the predictive power will on average be slightly less than optimal.

# Row of the CP table with the smallest cross-validated error.
best.cp <- spam.rpart0001$cptable[which.min(spam.rpart0001$cptable[,"xerror"]),]
best.cp
##           CP       nsplit    rel error       xerror         xstd 
##  0.002206288 17.000000000  0.288472146  0.322669608  0.012463811
# One-SD rule: among the subtrees whose xerror is within one standard
# error of the minimum, pick the one with the fewest splits.
sd.rule <- best.cp["xerror"]+best.cp["xstd"]
cptable.sd.rule <- spam.rpart0001$cptable[spam.rpart0001$cptable[,"xerror"]<=sd.rule,]
best.cp.sd <- cptable.sd.rule[which.min(cptable.sd.rule[,"nsplit"]),]
best.cp.sd
##          CP      nsplit   rel error      xerror        xstd 
##  0.00275786 16.00000000  0.29123001  0.32708218  0.01253624
# Prune the overgrown tree at the min-xerror CP and plot it.
tree.pruned <- prune(spam.rpart0001, cp=best.cp[1])
rpart.plot(tree.pruned, extra=106, box.palette="GnBu",branch.lty=3, shadow.col="gray", nn=TRUE)

# Prune at the more conservative one-SD-rule CP instead.
tree.pruned.sd <- prune(spam.rpart0001, cp=best.cp.sd[1])
printcp(tree.pruned.sd)
## 
## Classification tree:
## rpart(formula = yesno ~ crl.tot + dollar + bang + money + n000 + 
##     make, data = spam, method = "class", cp = 0)
## 
## Variables actually used in tree construction:
## [1] bang    crl.tot dollar  money   n000   
## 
## Root node error: 1813/4601 = 0.39404
## 
## n= 4601 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.4765582      0   1.00000 1.00000 0.018282
## 2 0.0755654      1   0.52344 0.54385 0.015352
## 3 0.0115830      3   0.37231 0.39272 0.013531
## 4 0.0104799      4   0.36073 0.38720 0.013453
## 5 0.0063431      5   0.35025 0.37176 0.013229
## 6 0.0055157     10   0.31660 0.35962 0.013048
## 7 0.0044126     11   0.31109 0.34639 0.012844
## 8 0.0038610     12   0.30667 0.34142 0.012767
## 9 0.0027579     16   0.29123 0.32708 0.012536
# Final pruned tree chosen by the one-SD rule (extra=106 shows the fitted
# class probability and the percentage of observations in each node).
rpart.plot(tree.pruned.sd, extra=106, box.palette="GnBu",branch.lty=3, shadow.col="gray", nn=TRUE)

Absolute cross validation error for this last model is: \(0.32708*0.39404=0.1288826\) (cross-validated relative error times the root node error), thus the model achieved an error rate of about 12.9%.

Trying with RandomForest..

library(randomForest)
# Random forest on the same six predictors (default 500 trees);
# importance=TRUE stores the permutation-based variable importances.
spam.rf <- randomForest(yesno ~ ., data=spam, importance=TRUE) 
print(spam.rf)
## 
## Call:
##  randomForest(formula = yesno ~ ., data = spam, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 11.58%
## Confusion matrix:
##      n    y class.error
## n 2644  144  0.05164993
## y  389 1424  0.21456150
# Variable importances: per-class and overall mean decrease in accuracy
# (permutation-based) plus the mean decrease in Gini impurity.
importance(spam.rf)
##                n         y MeanDecreaseAccuracy MeanDecreaseGini
## crl.tot 49.36727  54.83830             71.26223        251.48548
## dollar  54.67793  52.12678             72.15885        429.75399
## bang    90.99155 104.15714            114.76066        594.92500
## money   31.31415  49.96745             51.67082        196.82756
## n000    59.49803  14.89324             62.89929        120.11941
## make    14.15852  20.80571             24.78125         42.11846